library(tidyverse)
library(ggpubr)
devtools::load_all()
set.seed(44)
library(RColorBrewer)
Simulate the data
ncells <- 20000
nmarkers <- 20
beta_size <- 3
sigma_alpha <- 2
sigma_epsilon <- 1
# Biology design matrix
X <- matrix(c(rep(0,ncells/2), rep(1,ncells/2)), ncells, 1)
# Global mean (similair overall shift to CyTOF data)
mu <- matrix(10, ncells, nmarkers)
# Coefficients of interest
beta <- matrix(0, 1, nmarkers)
beta[c(1,3,5)] <- rep(beta_size, 3)
# Unwanted variation design matrix
# Note: This is UNCORRELATED with X
W <- matrix(rbinom(ncells, 1, 0.5), ncells, 1)
# Coeffcients of unwanted variation
alpha <- matrix(rnorm(nmarkers, sd = sigma_alpha), 1, nmarkers)
# Random error
epsilon <- matrix(rnorm(ncells*nmarkers, sd = sigma_epsilon), ncells, nmarkers)
# RUVIII model
Y <- mu + X %*% beta + W %*% alpha + epsilon
Annotate the simulation data
colnames(Y) <- paste0("Marker", 1:nmarkers)
sample_id <- rep(LETTERS[1:2], each = ncells/2)
cluster_id <- rep(NA, ncells) # Add nonsense to make correct format
data <- data.frame(sample = sample_id, cluster = cluster_id, Y)
data$cluster <- cluster_FlowSOM(data, 4)
Plot markers
plot_marker_boxplots(data)
plot_marker_boxplots_samp(data)
PCA plots
pca <- prcomp(as.matrix(data[, 3:ncol(data)]))
# Look at to what extent the pca vectors captures batch effect
plots <- list(qplot(1:ncells, pca$x[,1]) + labs(x = "Index", y = "PCA1") + theme_bw(),
qplot(1:ncells, pca$x[,2]) + labs(x = "Index", y = "PCA2") + theme_bw(),
qplot(1:ncells, pca$x[,3]) + labs(x = "Index", y = "PCA3") + theme_bw(),
qplot(1:ncells, pca$x[,4]) + labs(x = "Index", y = "PCA4") + theme_bw())
ggarrange(plotlist = plots)
ggarrange(plotlist = plot_scpca_samp(data, 2000), nrow = 1, ncol = 3, common.legend = TRUE)
ggarrange(plotlist = plot_scpca_clus(data, 2000), nrow = 1, ncol = 3, common.legend = TRUE)
tSNE plots
tsne_data <- compute_tsne(data, 1000)
## Warning: no DISPLAY variable so Tk is not available
## Running t-SNE...with seed 42 DONE
plot_tsne_sample(tsne_data)
plot_tsne_cluster(tsne_data)
#plot_tsne_marker(data, tsne_data)
Normalise the data
nrep_groups <- 2
rand <- sample(c(0,1), ncells, replace = TRUE)
# random pseudo replicates
#M <- cbind(rand, !rand)
# exactly wrong psuedo replicates
M <- cbind(W, !W)
# perfect psedo replicates
#M <- cbind(X, !X)
# Check ruv documentation condition satisfied
any(rowSums(M) == 0)
## [1] FALSE
# Standardise the cell data
# Could refactor RUVIII wrappers to make this easier
raw_Y <- as.matrix(data[ ,3:ncol(data)])
col_means <- colMeans(raw_Y)
col_sds <- apply(raw_Y, 2, function(x) sd(x))
for(i in 1:ncol(raw_Y)){
raw_Y[,i] <- (raw_Y[,i] - col_means[i])/col_sds[i]
}
exclude_neg_controls <- c(1,3,5)
controls <- c(1:nmarkers)[-exclude_neg_controls]
RUVIIout <- fastRUVIII(Y = Y, M, ctl = controls, k = 1)
norm_Y <- RUVIIout$newY
alpha <- RUVIIout$fullalpha
for(i in 1:ncol(norm_Y)){
norm_Y[,i] <- norm_Y[,i]*col_sds[i] + col_means[i]
}
norm_data <- data
norm_data[3:ncol(norm_data)] <- norm_Y
#pheatmap::pheatmap(alpha, cluster_cols = FALSE, cluster_rows = FALSE)
#RMY <- Y - M %*% solve(t(M) %*% M) %*% t(M) %*% Y
#pca <- prcomp(RMY)$x
#qplot(pca[,1], pca[,2]) + labs(x = "PCA1", y = "PCA2") + theme_bw()
Plot markers
plot_marker_boxplots(norm_data)
plot_marker_boxplots_samp(norm_data)
PCA plots
ggarrange(plotlist = plot_scpca_samp(norm_data, 2000), nrow = 1, ncol = 3, common.legend = TRUE)
tSNE plots
tsne_data <- compute_tsne(norm_data, 1000)
## Running t-SNE...with seed 42 DONE
plot_tsne_sample(tsne_data)
plot_tsne_cluster(tsne_data)